/*
 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens;

/**
 * A single token extracted from a String.
 * <p>
 * Tokens are immutable, and may be records one day.
 */
public class Token {

    /**
     * The token text.
     */
    public final String text;
    /**
     * The starting character offset of this token in the original text.
     */
    public final int start;
    /**
     * The ending character offset of this token in the original text
     * (exclusive, so {@code end - start} is the token length).
     */
    public final int end;
    /**
     * The token type.
     */
    public final TokenType type;

    /**
     * Constructs a token of type {@link TokenType#WORD}.
     *
     * @param text  the token text; should be equivalent to the substring of the
     *              original tokenized text covered by {@code start} and {@code end}
     * @param start the starting character offset of the token
     * @param end   the ending character offset of the token (exclusive)
     */
    public Token(String text, int start, int end) {
        this(text, start, end, TokenType.WORD);
    }

    /**
     * Constructs a token.
     *
     * @param text  the token text; should be equivalent to the substring of the
     *              original tokenized text covered by {@code start} and {@code end}
     * @param start the starting character offset of the token
     * @param end   the ending character offset of the token (exclusive)
     * @param type  the type of the token
     */
    public Token(String text, int start, int end, TokenType type) {
        this.text = text;
        this.start = start;
        this.end = end;
        this.type = type;
    }

    /**
     * The number of characters in this token.
     *
     * @return The number of characters.
     */
    public int length() {
        return end - start;
    }

    @Override
    public String toString() {
        // Builds "<text>[type=<type>,<start>,<end>]"; appending to an empty
        // builder keeps the original "null" rendering for a null text/type.
        StringBuilder builder = new StringBuilder();
        builder.append(text);
        builder.append("[type=").append(type);
        builder.append(',').append(start);
        builder.append(',').append(end);
        builder.append(']');
        return builder.toString();
    }

    /**
     * Tokenizers may produce multiple kinds of tokens, depending on the
     * application to which they're being put. For example, when processing a
     * document for highlighting during querying, we need to send through
     * whitespace and punctuation so that the document looks as it did in its
     * original form. For most tokenizer applications, they will only send word
     * tokens.
     */
    public enum TokenType {
        /**
         * A WORD corresponds to a token that does not consist of or contain
         * whitespace and may correspond to a regular "word" that could be looked
         * up in a dictionary. Some tokenizers do not distinguish between
         * different kinds of tokens and may use this as a default type for all
         * generated tokens.
         */
        WORD,
        /**
         * An NGRAM corresponds to a token that might correspond to a character
         * ngram - i.e. some portion / sub-span of a regular word token (for
         * example.)
         */
        NGRAM,
        /**
         * A PUNCTUATION corresponds to tokens consisting of punctuation
         * characters. In some applications, a PUNCTUATION may be treated
         * differently because they may have less semantic content than regular
         * word tokens.
         */
        PUNCTUATION,
        /**
         * Some tokenizers may produce tokens corresponding to whitespace (e.g.
         * space, tab, newline, etc.) It may be important for consumers of tokens
         * generated by a tokenizer to ignore/skip WHITESPACE tokens to avoid
         * unexpected behavior.
         */
        WHITESPACE,
        /**
         * Some tokenizers produce "sub-word" tokens. A PREFIX corresponds to a
         * sub-word word-prefix token.
         */
        PREFIX,
        /**
         * Some tokenizers produce "sub-word" tokens. A SUFFIX corresponds to a
         * sub-word word-suffix token.
         */
        SUFFIX,
        /**
         * Some tokenizers produce "sub-word" tokens. An INFIX corresponds to a
         * sub-word "infix" token (i.e. from the middle).
         */
        INFIX,
        /**
         * Some tokenizers may work in concert with vocabulary data. Some
         * applications may treat out-of-vocabulary tokens differently than other
         * tokens. An UNKNOWN token corresponds to a token that is
         * out-of-vocabulary or has never been seen before.
         */
        UNKNOWN
    }

}
